knitr::opts_chunk$set(echo = TRUE, message = F)
library(magrittr) # usefull for pipe %>% operators
library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
library(data.table)
library(ggplot2)
library(gridExtra)
library(evd)
library(extRemes)
source("1UsedFunc.R") # To call our created Functions, stored for better smoothness. We will name them "our" functions, to recognize them in the following.The (re)-constructed functions stored in “1UsedFunc.R”. We decided to show it here for completeness, but this is probably not necessary to look into this.
NOMSG <- function(x) invisible(capture.output(x)) # Just to tell Rmarkdown not to print useful outputs for some objects : avoid a too long report...
# Theme function for our builded ggplots, to get coherent colours, themes, etc...
# To minimize length of the ggplot's calls.
"theme_piss" <- function(size_p = 18, size_c = 14, theme = theme_bw()){
theme(plot.title = element_text(size = size_p, hjust=0.5,
colour = "#33666C", face="bold")) +
theme(axis.title = element_text(face = "bold", size= size_c,
colour = "#33666C")) +
theme
}
# This function aims to retrieve specific seasons from month standing
# in a particular dataset
func_season <- function(month){
if (month %in% c(01, 02, 03))
return("Winter")
else if (month %in% c(04, 05, 06))
return( "Spring" )
else if (month %in% c(07, 08, 09))
return("Summer")
else
return("Autumn")
}
# Function to retrieve the (yearly) extremes for a given list of extrema
# The two last arguments specifying if we want minima (Fun=min, tmp='TN')
# This returns the data AND data.frame with the corresponding year.
# Easier for ggplots
"yearly.extrm" <-
function(list.y = list_by_years, Fun = max, tmp = 'TX'){
max_years <- matrix(nrow = 116, ncol = 2)
for (i in 1:116) {
max_years[i,1] <- Fun(list.y[[i]][tmp])
}
max_years[,2] <- seq(1901, 2016 ,by = 1)
colnames(max_years) <- c("Max","Year")
max_data <- max_years[,"Max"]
if (tmp == "TN") colnames(max_years) <- c("Min", "Year")
max_frame <- as.data.frame(max_years)
out <- list(max_data, max_frame) ; names(out) <- c("data", "df")
return(out)
}
#yearly.extrm()$df
# Computes the Hill estimates of gamma (Section 5.2.3)
# for a numeric vector of observations (data) and as a function of k
# If plot=TRUE then the estimates are plotted as a function of k
# If add=TRUE then the estimates are added to an existing plot
"genHill" <- function(data, gamma, plot=FALSE, add=FALSE, ...) {
X <- sort(data) ; n <- length(X) ; UH.scores <- numeric(n)
Hill <- numeric(n) ; K <- 1:(n-1)
### Hill estimates
for (i in 1:(n-1)) {
UH.scores[i] <- X[n-i]*gamma[i]
}
for (k in 1:(n-2)) {
Hill[k] <- sum(log(UH.scores[1:k])-log(UH.scores[k+1]))/k
print(k)
}
### plots if = TRUE
if (plot || add){
if ( !add ) { ### plot estimates
plot(K, Hill[K], type="l", ylab="gamma", xlab="k",
main="Estimates of extreme value index", ...)
}
else { ### adds estimates to existing plot
lines(K, Hill[K], ...)
}
} ### output list with values of k and corresponding Zipf estimates
list(k=K, gamma=Hill[K])
}
# To compute the plots of both ACF and PACF
'acfpacf_plot' <- function(data, lagmax = length(data)){
par(mfrow = c(2, 1))
acf(data, lag.max = lagmax)
pacf(data, lag.max = lagmax)
}
"gev.rl.gradient" <- function (a, p) {
# scale <- z$mle[2]
# shape <- z$mle[3]
scale <- a[2]
shape <- a[3]
if (shape < 0)
zero.p <- p == 0
else zero.p <- logical(length(p))
out <- matrix(NA, nrow = 3, ncol = length(p))
out[1, ] <- 1
if (any(zero.p)) {
out[2, zero.p & !is.na(zero.p)] <- rep(-shape^(-1), sum(zero.p,
na.rm = TRUE))
out[3, zero.p & !is.na(zero.p)] <- rep(scale * (shape^(-2)),
sum(zero.p, na.rm = TRUE))
}
if (any(!zero.p)) {
yp <- -log(1 - p[!zero.p])
out[2, !zero.p] <- -shape^(-1) * (1 - yp^(-shape))
out[3, !zero.p] <- scale * (shape^(-2)) * (1 - yp^(-shape)) -
scale * shape^(-1) * yp^(-shape) * log(yp)
}
return(out)
}
# Return Levels with ggplot !
"rl_piss" <- function(a, mat, dat){
eps <- 1e-006
a1 <- a ; a2 <- a ; a3 <- a
a1[1] <- a[1] + eps ; a2[2] <- a[2] + eps ; a3[3] <- a[3] + eps
f <- c(seq(0.01, 0.999, length = length(dat)))
q <- gevq(a, 1 - f)
d <- t( gev.rl.gradient( a=a, p=1-f))
v <- apply(d, 1, q.form, m = mat)
df <- data.frame(x = -1/log(f), y = -1/log(f), q = q,
upp = q + 1.96 * sqrt(v), low = q - 1.96 * sqrt(v),
point = -1/log((1:length(dat))/(length(dat) + 1)),
pdat = sort(dat))
# Plot it
ggplot(df) + coord_cartesian(xlim = c(0.1, 1000)) +
geom_line(aes(x = y, y = q), col = "#33666C") +
geom_line(aes( x = y, y = low), col = "red") +
geom_line(aes (x = y, y = upp), col = "red") +
scale_x_log10() + ggtitle(" Return Level plot") +
geom_point(aes(x = point, y = pdat), col = "#33666C", shape = 1) +
theme_piss()
}
# Compute return levels of nonstationary model with the data (in years)
'return.lvl.nstatio' <-
function(data_year, gev_nstatio, t = 100, m = 10 ){
y_m <- -(1 / log(1 - 1/m))
t <- seq(max(data_year), max(data_year) + t, 1)
rl_m <- (gev_nstatio$mle[1] + gev_nstatio$mle[2] *
(t - max(max_years$df$Year))) +
(gev_nstatio$mle[3] / gev_nstatio$mle[4]) *
(y_m^gev_nstatio$mle[4] - 1)
g <- ggplot(data.frame(r.lvels = rl_m, years = t)) +
geom_point(aes(x = years, y = r.lvels))
print(g)
return(rl_m)
}
# Rearranging diplot() function from POT because there were speed problems.
'diplot_fast' <-
function (data, u.range, nt = max(200, nrow(data)))
{
data <- na.omit(data) ; date <- data[, "time"] ; samp <- data[, "obs"]
M <- diff(range(date))
thresh <- seq(u.range[1], u.range[2], length = nt)
DI <- NULL ; date <- floor(date) ; time.rec <- length(date)
for (u in thresh) {
nb.occ <- NULL
idx.excess <- samp > u
lambda <- sum(idx.excess)/M
for (year in 1:time.rec){
nb.occ <- c(nb.occ, sum(idx.excess & (date == year)))
#print(year)
}
DI <- c(DI, var(nb.occ)/lambda)
#print(u)
}
conf_sup <- qchisq(1 - (1 - .95)/2, M - 1)/(M - 1)
conf_inf <- qchisq((1 - .95)/2, M - 1)/(M - 1)
main <- "Dispersion Index Plot"
xlab <- "Threshold"
ylab <- "Dispersion Index"
plot(c(thresh, thresh[1]), c(DI, conf_sup), xlab = xlab,
ylab = ylab, type = "n", main = main)
rect(0, conf_inf, 2 * u.range[2], conf_sup, col = "lightgrey",
border = FALSE)
lines(thresh, DI)
return(invisible(list(thresh = thresh, DI = DI)))
}
####################### For Bayesian #######################
'log_post' <- function(mu, logsig, xi, data) {
for (i in 1:length(data)){
m[i] <- min((1 + (xi * (data[i] - mu)/logsig)))
}
if (!is.na(m[i] <= 0)) return(as.double(-1e10))
llhd <- sum(dgev(data, loc = mu, scale = exp(logsig),
shape = xi), log = TRUE)
lprior <- dnorm(mu, sd = 10, log = TRUE)
lprior <- lprior + dnorm(logsig, sd = 10, log = TRUE)
lprior + llhd
}
####################### For Neural Nets #######################
# Modifify source function to avoid unesful printings
'gevcdn.fit2' <- function (x, y, iter.max = 1000, n.hidden = 2, Th = gevcdn.logistic,
fixed = NULL, init.range = c(-0.25, 0.25), scale.min = .Machine$double.eps,
beta.p = 3.3, beta.q = 2, sd.norm = Inf, n.trials = 5, method = c("BFGS",
"Nelder-Mead"), max.fails = 100, ...)
{
if (!is.matrix(x))
stop("\"x\" must be a matrix")
if (!is.matrix(y))
stop("\"y\" must be a matrix")
method <- match.arg(method)
if (identical(Th, gevcdn.identity))
n.hidden <- 3
GML <- Inf
x.min <- apply(x, 2, min)
x.max <- apply(x, 2, max)
x <- sweep(sweep(x, 2, x.min, "-"), 2, x.max - x.min, "/")
y.min <- min(y)
y.max <- max(y)
y <- (y - y.min)/(y.max - y.min)
for (i in seq_len(n.trials)) {
exception <- TRUE
exception.count <- 0
while (exception) {
if (identical(names(init.range), c("W1", "W2"))) {
weights <- unlist(init.range) + gevcdn.initialize(x,
n.hidden, c(-0.25, 0.25))
}
else {
weights <- gevcdn.initialize(x, n.hidden, init.range)
}
fit.cur <- try(suppressWarnings(optim(weights, gevcdn.cost,
method = method, control = list(maxit = iter.max,
...), x = x, y = y, n.hidden = n.hidden, Th = Th,
fixed = fixed, scale.min = scale.min, beta.p = beta.p,
beta.q = beta.q, sd.norm = sd.norm)), silent = TRUE)
if (!class(fit.cur) == "try-error") {
exception <- fit.cur$value > 1e+308
}
if (exception)
exception.count <- exception.count + 1
if (exception.count == max.fails) {
stop("exception... check arguments")
}
}
GML.cur <- fit.cur$value
if (GML.cur < GML) {
GML <- GML.cur
output.cdn <- fit.cur
}
}
weights <- output.cdn$par
cost <- gevcdn.cost(weights, x, y, n.hidden, Th, fixed, scale.min,
beta.p, beta.q, sd.norm)
w <- gevcdn.reshape(x, weights, n.hidden)
attr(w, "x.min") <- x.min
attr(w, "x.max") <- x.max
attr(w, "y.min") <- y.min
attr(w, "y.max") <- y.max
attr(w, "Th") <- Th
attr(w, "fixed") <- fixed
attr(w, "scale.min") <- scale.min
NLL <- attr(cost, "NLL")
penalty <- attr(cost, "penalty")
attr(w, "GML") <- GML
attr(w, "NLL") <- NLL
attr(w, "penalty") <- penalty
if (sd.norm == Inf) {
if (length(fixed) == 3) {
k <- 3
}
else {
if (identical(Th, gevcdn.identity)) {
k <- (3 - length(fixed)) * (ncol(x) + 1) + length(fixed)
}
else {
k <- length(weights) - n.hidden * length(fixed)
}
}
n <- nrow(y)
BIC <- 2 * NLL + k * log(n)
AIC <- 2 * NLL + 2 * k
AICc <- AIC + (2 * k * (k + 1))/(n - k - 1)
attr(w, "BIC") <- BIC
attr(w, "AIC") <- AIC
attr(w, "AICc") <- AICc
attr(w, "k") <- k
}
return(w)
}
################ For Bootstrap (GPD) ################
# Function ton compute our estimator
mle.ksiFun <- function(x) {
fit0 <- ismev::gpd.fit(xdat = x, threshold = 0, show = F)
fit <- fit0$mle[2] ; return(fit)
}
# Function to compute standard error of our estimator
sdmle.ksiFun <- function(x) {
fit0 <- ismev::gpd.fit(xdat = x, threshold = 0, show = F)
fit <- fit0$se[2] ; return(fit)
}
##############################################We will mark in comments some results that are important but which would make the report too long. We will also use the created (see above) function NOMSG() to hide unuseful printed results of some functions.
This is not linked with the further practical analysis but we decide to just put the final general (which will probably appear in the final text ?!) plot representing the 3 GEV distributions of interests. Furthermore, it will be interesting to check visually for the tail behaviour of our fitted model, \(\cdots\)
'GEVdfFun' <-
function (x = seq(-10, 10, length = 10^4), mu = 0, sig = 1, ksi = 0) {
# Manually
if (ksi ==0) { dens <- exp(-x) * exp(-exp(-x)) }
else
s <- (1 + ksi * (x - mu)/sig)^(-(ksi)^-1 - 1)
t <- (1 + ksi * (x - mu)/sig)^(-(ksi)^-1)
if (ksi < 0) {dens <- s * exp(-t) * ( (x - mu)/sig < -1/ksi ) }
if (ksi > 0) {dens <- sig^{-1} * s * exp(-t) * ( (x - mu)/sig > -1/ksi ) }
df <- data.frame(x = x, density = dens,xi = as.factor(ksi),
mu = as.factor(mu), scale = as.factor(sig))
return(df)
}
GEVdf <- rbind(GEVdfFun(ksi = -.5), GEVdfFun(ksi = 0), GEVdfFun(ksi = .5))
endpoint_w <- 0 - (1 / -0.5)
endpoint_f <- 0 - (1 / 0.5)
dens_f <- ifelse(GEVdf[GEVdf$xi == 0.5,]$density < endpoint_f, NA,
GEVdf[GEVdf$xi == 0.5,]$density )
GEVdf[GEVdf$xi == 0.5,]$density <- dens_f
# Plot normal distribution in black !!!!
GEVdf <- cbind(GEVdf, norm = dnorm(GEVdf$x, mean = 0, sd = 1))
GEVdf[GEVdf$density < 10^{-312}, ]$density <- NA
pres <- labs(title = expression(paste(underline(bold('Generalized Extreme Value density')), " (µ", '=0,', sigma, "=1)")), colour = expression(paste(xi,"=")),linetype = expression(paste(xi,"=")))
gf <- ggplot(GEVdf, aes(x = x, y = density, colour = xi )) +
geom_line() + pres + coord_cartesian(xlim = c(-6, 6)) +
geom_line(aes(x = x, y = norm, col = "normal"), col = "black", linetype = 3) +
theme_piss(20, 15, theme_classic() ) +
theme(legend.title = element_text(colour="#33666C",
size=18, face="bold")) +
theme(legend.key = element_rect(colour = "black")) +
guides(colour = guide_legend(override.aes = list(size = 2))) +
geom_point(aes(x = endpoint_f, y = 0),size = 3.5) +
geom_point(aes(x = endpoint_w, y = 0), col="red",size = 3.5)
gright <- ggplot(GEVdf, aes(x = x, y = density, colour = xi )) +
geom_line() + pres +
coord_cartesian(xlim = c(1.8, 5.5), ylim = c(0,.15)) +
geom_line(aes(x = x, y = norm, col = "normal"), col = "black", linetype = 3) +
theme_piss(18, 15, theme_minimal()) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
theme(axis.line = element_line(color="#33666C", size = .3)) +
theme(legend.title = element_text(colour="#33666C",
size=18, face="bold")) +
theme(axis.title = element_blank()) +
labs(title = expression(paste(bold("Right tails")))) +
geom_point(aes(x = endpoint_w, y = 0), col = "red", size = 3) +
scale_color_discrete(guide=F)
gleft <- ggplot(GEVdf, aes(x = x, y = density, colour = xi )) +
geom_line() + pres +
coord_cartesian(xlim = c(-0.8,-2.5), ylim = c(0,.2)) +
geom_line(aes(x = x, y = norm, col = "normal"), col = "black", linetype = 3) +
theme_piss(18, 15, theme_minimal()) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line = element_line(color="#33666C", size = .3),
legend.title = element_text(colour="#33666C",
size = 18, face="bold")) +
theme(axis.title = element_blank()) +
labs(title = expression(paste(bold("Left tails")))) +
geom_point(aes(x = endpoint_f, y = 0), size = 3) +
#theme(panel.border = element_rect(linetype = "dashed", colour = "black"))+
scale_color_discrete(guide=F)
library(grid)
vpleft <- viewport(width = 0.29,
height = 0.43,
x = 0.234,
y = 0.42)
vpright <- viewport(width = 0.28,
height = 0.415,
x = 0.775,
y = 0.43)
gf
print(gleft, vp = vpleft)
print(gright, vp = vpright)Representation of the Densities and their tails for the 3 GEV distributions together with the normal distribution as reference (dotted)
Plot the normal distribution (dotted lines) as “reference” ?
#################### Uccle ################
###########################################
# From 1833. Free dataset from the KNMI
TX_public <- read.csv("TX Uccle public.csv")
TX_public$TX <- TX_public$TX/10
## From IRM
TXTN_open <- read.csv('Uccle TX TN 1901.csv',sep="")
setnames(TXTN_open,"DAY","Date")
TXTN_closed <- read.csv('Uccle TX TN 1901 closed shelter.csv',sep="")
setnames(TXTN_closed,"DAY","Date")
#TXTN_closed$Date <- as.Date(TXTN_closed$Date)
#save(TXTN_closed,file="TXTN_closed.R")
TXTN_closed$day <- substr(TXTN_closed$Date,7,8)
TXTN_closed$month <- as.numeric(substr(TXTN_closed$Date,5,6))
TXTN_closed$year <- substr(TXTN_closed$Date,1,4)
# Retrieve seasons with our function
#Of course, we are based on meteorological seasons
TXTN_closed$season <- sapply(TXTN_closed$month, function(x) func_season(x))
##### Differences between the public and the IRM datasets ######
# remove missing values and start from 1901
TX_public_1901 <- TX_public[TX_public$DATE >= 19010101 & TX_public$Q_TX != 9,]
TXTN_open_compare <- TXTN_open[TXTN_open$Date < 20000101, ]
TXTN_closed_compare <- TXTN_closed[TXTN_closed$Date < 20000101, ]
diff_tx_open <- TX_public_1901$TX - TXTN_open_compare$TX
diff_tx_closed <- TX_public_1901$TX - TXTN_closed_compare$TX
library(psych)
#describe(diff_tx_closed)
#describe(diff_tx_open)
diff_tx_open <- data.frame(diff = diff_tx_open, method = 'open')
diff_tx <- rbind(diff_tx_open,
data.frame(diff = diff_tx_closed, method = "closed"))
#ggplot(diff_tx, aes(group = method)) + geom_boxplot(aes(y = diff, x = method))
sum(equals(diff_tx_open$diff, 0.0), na.rm = T) / length(diff_tx_open$diff)## [1] 0.5437097
sum(equals(diff_tx_closed, 0.0), na.rm = T) / length(diff_tx_closed)## [1] 0.1441135
These are the differences between the public data set (freely available, used by (Beirlant et al. 2006) ) and the dataset owned by the IRM (confidential), for instance :
Indeed, this could be problematic (…) and this seems really important to have reliable data to make relevant analysis.
# Insert "-" in dat so as they match date values in R
TXTN_closed$Date <- gsub('^(.{4})(.*)$', '\\1-\\2', TXTN_closed$Date)
TXTN_closed$Date <- gsub('^(.{7})(.*)$', '\\1-\\2', TXTN_closed$Date)
TXTN_closed$Date <- as.Date(TXTN_closed$Date)We decide to do (at first?) all the further analysis in closed shellter . It is better from a climatological point of view (following the IRM).
TXTN_closed$month <- as.factor(TXTN_closed$month )
############## Group temp by month
g1 <- ggplot(data = TXTN_closed, aes(group = month)) + geom_boxplot(aes(x = month, y = TX)) + theme_bw()
# Variance is quite similar accros months
# quite same distribution of the TX accross months
library(RColorBrewer)
# months
g11 <- ggplot(data = TXTN_closed, aes(TX, colour = month)) +
geom_density(size=1.1) + theme_bw() + scale_color_discrete()
# Note : we used same smoothing factor for all densities
# seasons
g2 <- ggplot(data=TXTN_closed,aes(TX,colour=season)) + geom_density(size=1.1) +
theme_bw()
g22 <- ggplot(data=TXTN_closed,aes(x=season,y=TX,group=season)) + geom_boxplot() + theme_bw()
grid.arrange(g11, g1, g2, g22, nrow = 2)Some comments can be made here :
Distributions accross months \(\approx\) similar but it is more spread for autumn and spring months than for summer/winter months. This is because these are “transition months” (\(\dots\))
Same arise for the seasons. This must already tell us that we have to be prudent regarding our modelling, e.g. for the block-length in GEV, or the (time-varying?) threshold in POT, \(\dots\)
However, these plots are rather intuitive so it is not necessary to make it long. We could have made much more other preliminary descriptive plots but we think it is not really useful though.
max_all <- TXTN_closed$TX
library(xts)
library(dygraphs)
library(zoo)
xtdata0 <- xts(TXTN_closed$TX, order.by = (TXTN_closed$Date), f = 12)
dygraph(xtdata0, main = "(Dynamic) Time series of TX in Uccle", xlab = "Date", ylab = "TX") %>% dyRangeSelector() “Dynamic” view of the series : Reduce the window length by scrolling the below cursor, for your convenience (or select directly the area with your mouse). For example here, take the minimal length to have a good visualisation (it displays ~ 2,5 years). Or select the area you want to see.
It is particularly interesting in large time series if we want to detect something we could expect a priori , or if one would want to easily inspect the whole serie for his knowledge. Here, nothing special draws our attention. We notice this reccurent (and relatively homogeneous) oscillation through time, corresponding to the seasons. (–> nonstationary, see later) We see that this oscillation as period of 1 year (obvious), so yearly-blocks for GEV seem acceptable.
# block length : we go for the usual method of 1 year
list_by_years <- split(TXTN_closed, TXTN_closed$year)
# 116 years of data. Now we will use the function created in UsedFunc.R
# To retrieve yearly extrema
####### MAX by year. We take our function to do that.
max_years <- yearly.extrm()
####### MIN by year
min_years <- yearly.extrm(Fun = min, tmp = "TN")Compare the two trends : for TX and for TN :
“Ugly” code for the plots in the following… you probably won’t want to press on “code”
summary(lm1 <- lm(max_years$data ~ max_years$df$Year))$Coefficient## NULL
# p-val 5.10^-5, trend (very) significant
lm1_1 <- lm1$coefficients[1]
lm1_2 <- lm1$coefficients[2]
lm_BL1 <- lm(max_years$data[1:75] ~ max_years$df$Year[1:75])
lm_BL2 <- lm(max_years$data[77:116] ~ max_years$df$Year[77:116])
g1 <- ggplot(data = max_years$df,aes(x=Year,y=Max)) + geom_line() +
geom_smooth(method='lm',formula=y~x, aes(colour = "Linear")) +
geom_line(data = max_years$df[max_years$df$Year %in% 1901:1975,],
aes(x = Year, colour = "BrokenLinear",
y = predict(lm_BL1)),
size = 1.5, linetype = "twodash") +
geom_line(data = max_years$df[max_years$df$Year %in% 1977:2016,],
aes(x = Year, colour = "BrokenLinear",
y = predict(lm_BL2)),
size = 1.5, linetype = "twodash") +
stat_smooth(method = "loess", se = F, aes(colour = 'LOESS')) +
labs(title = "Complete Serie of Annual TX in Uccle") +
theme_piss(20, 15) +
theme(axis.line = element_line(color="#33666C", size = .45)) +
scale_colour_manual(name="Trend",
values=c(Linear="blue", BrokenLinear="cyan", LOESS="red")) + theme(legend.position = c(.888, .152)) +
theme(legend.title = element_text(colour="#33666C",
size=12, face="bold"),
legend.background = element_rect(colour = "black"),
legend.key = element_rect(fill = "white")) +
guides(colour = guide_legend(override.aes = list(size = 2)))
# Red line is local polynomial regression fitting curve (loees)
# The (optimal) default method is convenient
### what for the minima ??
# as expected , trend is a bit less strong as for maxima
summary(lm_min <- lm(min_years$data ~ min_years$df$Year))##
## Call:
## lm(formula = min_years$data ~ min_years$df$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.256 -2.042 0.713 2.463 6.397
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -70.471139 18.606651 -3.787 0.000245 ***
## min_years$df$Year 0.030942 0.009499 3.257 0.001482 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.426 on 114 degrees of freedom
## Multiple R-squared: 0.08515, Adjusted R-squared: 0.07712
## F-statistic: 10.61 on 1 and 114 DF, p-value: 0.001482
# but it is still significant
g2 <- ggplot(data = min_years$df, aes(x=Year,y=Min)) + geom_line() +
geom_smooth(method='lm',formula=y~x) + theme_bw() +
stat_smooth(method = "loess", se = F, col = 'red' ) +
labs(title = "Complete Serie of Annual TN in Uccle") +
theme_piss(20, 15) +
theme(axis.line = element_line(color="#33666C", size = .45))
## Both TX and TN on the same page
grid.arrange(g1, g2, nrow = 2)We see with this method trend is a bit less significant upward trend for TN than for TX. The same arise for the two broked-linear trends (output not shown here).
See the statistical tests to compare two trends (and for the broken-linear trend)
We decide (for the moment?) to go on with the series of maximum temperatures (“TX”)
We tried with \(\approx\) all the packages to check the results. Obviously, these are nearly the same.
library(evd)
library(extRemes)
library(ismev)
# Do it with several methods from different packages. Just to check if it is the same
NOMSG( gev_tx <- gev.fit(max_years$data) )
gev_tx1 <- fgev(max_years$data)
summary(gev_tx2 <- fevd(max_years$data, units = "deg C"))##
## fevd(x = max_years$data, units = "deg C")
##
## [1] "Estimation Method used: MLE"
##
##
## Negative Log-Likelihood Value: 251.7511
##
##
## Estimated parameters:
## location scale shape
## 30.5869550 2.0809482 -0.2543586
##
## Standard Error Estimates:
## location scale shape
## 0.21582146 0.15534602 0.06693779
##
## Estimated parameter covariance matrix.
## location scale shape
## location 0.046578904 0.003076151 -0.005833901
## scale 0.003076151 0.024132385 -0.006005115
## shape -0.005833901 -0.006005115 0.004480668
##
## AIC = 509.5023
##
## BIC = 517.7631
plot(gev_tx2) At first sight, fit seems OK
#ci(gev_tx2,type="parameter")
#ci(gev_tx2,type="parameter",which.par =3,
# xrange=c(-.4,.1),method="proflik",verbose=TRUE)
NOMSG( ci(gev_tx2, method="proflik", xrange=c(35,40), verbose = T) )#ci(gev_tx2, method="proflik", type = "parameter", which.par = 3, verbose=TRUE)
#it is often better to obtain confidence intervals from the profile-likelihood method in
#order to allow for skewed intervals that may better match the sampling df of these parameterWe can see that profile likelihood intervals are more preferable (here for return levels) because they can take into account the asymetry that is present in the likelihood surface for these kind of inferences (return levels, \(\xi\), \(\dots\)) We will see more about return levels in section 2.4
Even if one could see this as unnecessary here, we wanted to test (by LR) whether this is significant to have a bounded distribution for this process, or if the Gumbel (unbounded) could be preferable.
# Gumbel ? ie test if shape parameter is significant
gev_gumb <- fevd(max_years$data,type="Gumbel", units="deg C")
# plot(gev_gumb) # fit is well poorer
# plot(gev_gumb,"trace")
lr.test(gev_gumb,gev_tx2) # p-value=0.001 --> we reject H0 that xi=0##
## Likelihood-ratio Test
##
## data: max_years$datamax_years$data
## Likelihood-ratio = 10.184, chi-square critical value = 3.8415,
## alpha = 0.0500, Degrees of Freedom = 1.0000, p-value = 0.001417
## alternative hypothesis: greater
We see no evidence that Gumbel distribution could be preferable.
We wanted to implement our own own likelihood function and then optimize it manually to check if the results are the same. Moreover, it will be useful to do this for other applications (computing posterior in Bayesian analysis, Bootstrap, …)
gev.loglik <- function(theta, data){
y <- 1 + (theta[1] * (data - theta[2]))/theta[3]
if((theta[3] < 0) || (min(y) < 0)) {
ans <- 1e+06
} else {
term1 <- length(data) * logb(theta[3])
term2 <- sum((1 + 1/theta[1]) * logb(y))
term3 <- sum(y^(-1/theta[1]))
ans <- term1 + term2 + term3
}
ans
}
param <- c(mean(max_years$df$Max),sd(max_years$df$Max),
0.1 )
# 0.1 starting value for Xi is common (~ mean of all practical applications in meteo), or see Coles
dataset <- max_years$data
nlm(gev.loglik, param, data = dataset,
hessian=T, iterlim = 1e5)$estimate## [1] -0.2543569 30.5869360 2.0809473
Comparing with the ones above, it is OK !!!
Hence, we wanted to compute what is the upper endpoind of this (Weibull-typed) distribution. We remember the upper endpoint is \(x_*=\mu + \sigma\cdot|\xi|^{-1}\)
upp_endpoint <- gev_tx1$estimate['loc'] -
gev_tx1$estimate['scale'] / gev_tx1$estimate['shape']
upp_endpoint ; max(max_years$data) # OK greater than the maximum value of the data## loc
## 38.76715
## [1] 36.6
# estimated proba of exceeding this value will be exactly 0. : Bound for return levelsWe see that the sample properties of this model take into account the large uncertainty from the fact that there is only 116 years of historic, and thus it permits to go beyond this maximum value (with very small probability), as we have seen with our profile-likelihood ci for return levels.
library(fExtremes)
gev_pwm <- gevFit(max_years$data, type = "pwm")
gev_lmom <- fevd(max_years$data, units = "deg C", method = "Lmoments")
gev_gmle <- fevd(max_years$data, units = "deg C", method = "GMLE" )
gev_bay <- fevd(max_years$data, units = "deg C", method = "Bayesian" )If we look at the estimates (not shown here), they look relatively similar. However, we will get more on that later (Performance Simmulation ?)
Probability plot
# get order statistics
max_order <- sort(max_years$data)
# retrieve the the empirical distribution function
empirical= c()
for(i in 1:length(max_order)){
empirical[i] <- i/(length(max_years$data)+1)
}
# compute Distribution function of the modelled GEV
GEV.DF <- function(data,mu,sigma,xi){
if(xi == 0){
GEV <- exp(-exp(-((data-mu)/sigma)))}
else{
GEV <- exp(-(1+xi*((data-mu)/sigma))^(-1/xi))}
return(GEV)
}
model_est <- c()
for(i in 1:length(max_order)){
model_est[i] <- GEV.DF(max_order[i],gev_tx1$estimate[1],
gev_tx1$estimate[2],gev_tx1$estimate[3])
}
gg_pp <- ggplot(data = data.frame(empirical,model_est),
aes(x=empirical,y=model_est)) +
geom_point(shape = 1, col = "#33666C") + geom_abline(intercept=0,slope=1,col="red") +
theme_piss(16, 11) + ggtitle("Probability plot") QQ-plot
# Compute the quantile function (inverse of DF)
model_quantile <- length(max_order)
GEV.INV <- function(data, mu, sigma, xi){
if(xi==0){ INV <- mu - sigma * log(-log(1 - data)) }
else{ INV <- mu + (sigma/xi) * (((-log(data))^(-xi))-1) }
return(INV)
}
for(i in 1:length(max_order)){
model_quantile[i] <- GEV.INV(empirical[i], gev_tx1$estimate[1],
gev_tx1$estimate[2], gev_tx1$estimate[3])
}
gg_qq <- ggplot(data=data.frame(model_quantile,max_order), aes(x=max_order,y=model_quantile)) +
geom_point(shape = 1, col = "#33666C") + geom_abline(intercept=0,slope=1,col="red") +
theme_piss(16, 11) + ggtitle("QQ-plot")
# Same conclusion
grid.arrange(gg_qq,gg_pp, nrow = 1)Fit seems quite well for this model.
And what about the Return Levels ?
# 100-year return level ?
y_100 <- -(1 / log(1 - 1/100))
r_m100 <- gev_tx1$estimate[1] + (gev_tx1$estimate[2] / gev_tx1$estimate[3]) *
(y_100^gev_tx1$estimate[3] - 1)
# see eq. in the text --> here we take the estimate
r_m100 <- GEV.INV(1 - 1/100, gev_tx1$estimate[1], gev_tx1$estimate[2], gev_tx1$estimate[3])
# or directly by our Inverse function at survival probability. Same resultSo we expect that this is the value of temperature which will be exceeded on average every 100 years.
We show the output for several return periods :
return.level(gev_tx2, return.period = c(2, 10, 100, 10000))## fevd(x = max_years$data, units = "deg C")
## return.level.fevd.mle(x = gev_tx2, return.period = c(2, 10, 100,
## 10000))
## [1] "GEV model fitted to max_years$data (deg C)"
## Data are assumed to be stationary
## [1] "Return Levels for period units in years"
## 2-year level 10-year level 100-year level 10000-year level
## 31.31518 34.15255 36.22917 37.98218
First, beware that these assume that data are stationary, while they are probably not (?). We remark clearly that these inferences on return levels do not take into account the (probable) climate warming, i.e. the fact that mean temperature increases slightly over time AND also the fact that extremes are more frequent in a climate change, \(\dots\) It is problematic and we must take into account this reality. Moreover, from the shape of the return level plot (with \(\xi<0\), see below), the slope is decreasing over time, see the plot below which is probably not one could expect for return levels (…)
Another problem we must mention is the poor ability of prediction beyond the range the data. Having only 116 years of data, the model will be poor to estimate something which is expected to arise in 10000 years. Indeed, one shall not expect a TX of 38deg reached only after 10,000 years… But the aim is not to predict over a so large range.
# Standard errors of the estimates by hand
m <- 20 # return period
r_m <- -log(1 - (1/m))
nabla <- matrix(ncol = 1, nrow = 3)
nabla[1,1] <- 1
nabla[2,1] <- -((gev_tx1$estimate[3])^(-1))*(1-(r_m^(-gev_tx1$estimate[3])))
nabla[3,1] <- ((gev_tx1$estimate[2])*
((gev_tx1$estimate[3])^(-2))*(1-((r_m)^(-gev_tx1$estimate[3]))))-
((gev_tx1$estimate[2])*((gev_tx1$estimate[3])^(-1))*
((r_m)^(-(gev_tx1$estimate[3])))*log(r_m))
#sqrt(t(nabla)%*%gev_tx1$var.cov%*%nabla)
# Retrieve return period associated to prob of exceeding thresold from the fitted GEV
proba_excess <- pextRemes(gev_tx2, q = c(25, 30, 35, 38, upp_endpoint),
lower.tail = F)
(proba_excess)^-1## [1] 1.000435e+00 1.367953e+00 2.157497e+01 1.094352e+04 3.002400e+15
These represent the probability of exceeding (= return periods) respectively 25, 30, 35, 38 and \(x_*=38.77\). We see the huge difference between the two last (by factor \(\approx 10^{11}\) ) because they locate in the (right) tail of the Weibull fitted process which is (very) light-tailed (and bounded), see e.g. first where the bound is less “restrictive” here (\(\hat{\xi}\approx -0.25>-0.5\)) and thus the tail goes a bit more far beyond (…)
sum(max_years$data > 35)## [1] 4
Only 4 years have reached 35deg. The return period of 21.6 years found above for 35deg seems thus coherent.
# See our function built in UsedFunc.R for ggplot
gg_rl <- rl_piss(gev_tx$mle, gev_tx$cov, gev_tx$data)
## Density plots
x <- seq(min(max_years$data)-5, max(max_years$data)+5, length = length(max_years$data))
weib_fit_dens <- evd::dgev(x,loc = gev_tx$mle[1],
scale = gev_tx$mle[2], shape = gev_tx$mle[3])
gg_ds <- ggplot(data.frame(x,weib_fit_dens)) + stat_density(aes(x = max_years$data), geom = "line") + ggtitle("Empirical (black) vs fitted (red) density") +
geom_line(aes(x = x, y = weib_fit_dens), colour = "red") + theme_piss(size_p = 5) +
coord_cartesian(xlim = c(25, 38)) +
geom_vline(xintercept = min(max_years$data), linetype = 2) +
geom_vline(xintercept = max(max_years$data), linetype = 2) + theme_piss()
# Red is the fitted density while black is the empirical one.
grid.arrange(gg_rl, gg_ds, nrow = 1)#gev.diag(gev_tx) # All-in-One from package *Ismev*Confidence bands seem not too large in the return level plot, but it increases with return period (uncertainty), especially above the range of data (116 here). We see the decreasing slope (Weibull-typed) discussed above, typical for temperature data. (…)
The second plot compare the fitted with the empirical density. We see that the fit seems relatively well. We also added vertical dotted lines representing the minimum and maximum yearly value for TX. We see that both process go beyond this range (…) Why is this for the empirical curve ??
Profile likelihoods intervals for parameters :
#### Profile likelihood ( For stationary model !!! )
#gev.prof(gev_tx, 20, xlow = min(max_years$data)+7, xup = max(max_years$data))
#gev.profxi(gev_tx,xlow=-1,xup=2)
par(mfrow=(c(1,3)))
NOMSG( plot(profile(gev_tx1), ci=c(0.95,0.99)) ) (See source of gev.profxi(), ismev)
Note :
logLik(gev_tx1)[1] -0.5 * qchisq(p = 0.95, df = 1)## [1] -253.6719
logLik(gev_tx1)[1] -0.5 * qchisq(p = 0.99, df = 1)## [1] -255.0686
Inded, we remark that it matches with the plots. Dotted lines are at the same ordinate for the 3 plots, but using different scales.
For doing the analysis, we will rely mainly on POT package. It is a bit old … but it still does the job.
library(POT)Mean Residual Life plot
max_all <- TXTN_closed$TX
# u <- seq(min(max_all),max(max_all),0.01)
# x <- c()
# for (i in 1:length(u)) {
# threshold.excess <- max_all[max_all>u[i]]
# x[i] <- mean(threshold.excess-u[i])
# }
# plot(x~u,type="l",main="MRL plot",ylab="mean excess", xlim = c(28,38), ylim = c(0,3))
## Several methods to do this plot, from packages or by hand ....
# mrlplot(max_all)
# mrl.plot(max_all)
# MeanExcess(max_all, plot = T, k =1000)
mrl.plot(max_all, umin = 20, umax = max(max_all)) # Explore valuesWe notice the expected decreasing behaviour for this light-tailed distribution. We would go for 31-32deg as threshold (?), as we see \(\approx\) some linearity in mrl plot. But, this is really approximated and subjective…
Selection based on stability of the estimates : TCplot
par(mfrow = (c(2,1)))
threshrange.plot(max_all, r = c(28,35) )#tcplot(max_all, u.range = c(28, 35), nt = 50)With these plots we could go for 32-33deg as the estimations of \(\sigma\) and \(\xi\) are \(\approx\) stable until this. Combining both methods, we would thus go for 32deg.
Dispersion Index (DI) Plot
First we must “decluster” the series with clust() to prevent from strong autocrrelations in this kind of series. We talk more on (de)clustering in section 4.
data.di <- data.frame(time = seq(1:length(max_all)), obs = max_all)
events <- clust(data.di, u = 26, tim.cond = 20/365, clust.max = TRUE)
#diplot(events[,-3], u.range = c(28,36) )
diplot_fast(events[,-3], u.range = c(25,35), nt = 1000 )We have rearranged the original diplot() because there were speed problems (too much iterations in the loop, etc…)
In theory, if DI is significantly close to 1, the sample can be modelled by a Poisson process and the correspondent threshold is then not rejected.
So here, we can definitively not decide from this plot (or an error occured during the computation…). Even with a tedious trial-and-error, we did not found anything sastifying.
L-moments Plots
NOMSG( lmomplot(max_all, u.range = c(25, 35), identify = T) )As communicated by (???), creator of the package, the interpretation of this plot is really tedious.
Conclusion from all these methods :
From this, one could consider climatological-based threshold (30deg, which is natural in meteorology to denote hot temperature) OR consider other modelling (varying threshold, mixtures,..) \(\Rightarrow\) see later
NOMSG( fit_pot1 <- gpd.fit(max_all, 30) )
fit_pot <- fevd(max_all,threshold = 30, type="GP") ## They use MLE by default
# pextRemes(fit_pot,c(25,30,35,36), lower.tail = FALSE)
gpd_mom <- fitgpd(max_all, 30, "moments")
gpd_mle <- fitgpd(max_all, 30, "mle")
gpd_pwmu <- fitgpd(max_all, 30, "pwmu")
gpd_pwmb <- fitgpd(max_all, 30, "pwmb")
gpd_pickands <- fitgpd(max_all, 30, "pickands")
gpd_med <- fitgpd(max_all, 30, "med", start = list(scale = 2, shape = 0.1))
gpd_mdpd <- fitgpd(max_all, 30, "mdpd") # mean power density divergence
gpd_mple <- fitgpd(max_all, 30, "mple")
gpd_ad2r <- fitgpd(max_all, 30, "mgf", stat = "AD2R") # maximum goodness-of-fit with modified Anderson Darling statistic
print(rbind(mom = gpd_mom$param, mle = gpd_mle$param,pwm.unbiased = gpd_pwmu$param, pwm.biased = gpd_pwmb$param, pickands = gpd_pickands$param, median = gpd_med$param, mdpd = gpd_mdpd$param, MpenalizedLE = gpd_mple$param, ad2r = gpd_ad2r$param))## scale shape
## mom 2.249022 -0.3438105
## mle 2.138026 -0.2848215
## pwm.unbiased 2.291331 -0.3690906
## pwm.biased 2.295071 -0.3713250
## pickands 2.378591 -0.4854268
## median 2.230973 -0.2927716
## mdpd 2.154409 -0.2910819
## MpenalizedLE 2.139898 -0.2853814
## ad2r 2.033462 -0.2484262
There is variability in the results among the different methods available (to study…). Comparing the shape parameter estimation with this obtained with GEV, it is a bit more small. (must decrease threshold ?)
Note that Estimators based on extreme order statistics are Pickands and Moment.
gpd.diag(fit_pot1) #; plot(fit_pot) OK
mleci <- gpd.fiscale(gpd_mle, conf = 0.9)
momci <- gpd.fiscale(gpd_mom, conf = 0.9)
pwmuci <- gpd.fiscale(gpd_pwmu, conf = 0.9)
pwmbci <- gpd.fiscale(gpd_pwmb, conf = 0.9)
print(rbind(mleci,momci,pwmuci,pwmbci))## conf.inf.scale conf.sup.scale
## mleci 1.902266 2.373786
## momci 1.951365 2.546679
## pwmuci 1.961874 2.620789
## pwmbci 1.965046 2.625096
shape_fi <- gpd.fishape(gpd_mle)
shape_pf <- gpd.pfshape(gpd_mle, range = c(-0.4, -0.1), conf = c(.95))Pay attention to the weird shape of profile \(\ell(\cdot)\) for shape.
gev_pp <- fevd(max_years$data, units = "deg C", threshold = 32,
type = "PP", verbose = T)##
## Data span 0.3148528 years
## Setting up parameter model design matrices.
## Parameter model design matrices set up.
## Initial estimates found where necessary (not from Lmoments). Initial value = -104.1134
## Initial estimates are:
## location scale shape
## 37.44072579 1.10333110 0.00000001
## Beginning estimation procedure.
##
## First fitting a GP-Poisson model in order to try to get a good initial estimate as PP likelihoods can be very unstable.
## initial value 68.858543
## iter 10 value 61.176729
## final value 61.176720
## converged
##
## Sticking with originally chosen initial estimates.
## initial value -104.113442
## iter 10 value -110.454630
## iter 20 value -111.792068
## final value -111.795252
## converged
## Time difference of 0.02575207 secs
# plot(gev_pp)
## Beware : For non-stationary models, the density plot is not provided.
# plot(gev_pp, "trace")
# ci(gev_pp, type = "parameter")
suppressWarnings(NOMSG( threshrange.plot(max_all, r = c(27, 34), nint = 20, type = "PP") ) )Etc \(\dots\)
32deg seems again a good choice regarding parameter stability estimates (\(\dots\)).
We will see more useful applications in the follwoing (especially in the (non)stationnary context)
Not relevant to do POT for whole days in year. We saw that max in winter never went a yearly TX, etc…
\(\Rightarrow\) Can’t we make better analysis on divided datasets ?
DIVIDING BY WINTER-SUMMER
Here we model considering “winter months” (from october to march, ie from month 10 to 03) and summer months (from april to october, ie [04-10])
We commented outputs on some descriptive statistics, etc… for convenience. But we let those available if one wants to retrieve these interesting results
TXTN_closed_s <- TXTN_closed[TXTN_closed$month %in% c(4, 5, 6, 7, 8, 9),]
TXTN_closed_w <- TXTN_closed[TXTN_closed$month %in% c(1, 2, 3, 10, 11, 12),]
# describe(TXTN_closed_s)
# describe(TXTN_closed_w)
TXTN_closed_s <- data.frame (TXTN_closed_s, period = "1")
TXTN_closed_w <- data.frame (TXTN_closed_w, period = "0")
TXTN_closed_ws <- rbind(TXTN_closed_w, TXTN_closed_s)
# ggplot(TXTN_closed_ws, aes(x = TX, col = period)) +
# geom_density() + theme_bw()
gX <- ggplot(TXTN_closed_ws, aes(x = period, y = TX)) + geom_boxplot()
gN <- ggplot(TXTN_closed_ws, aes(x = period, y = TN)) + geom_boxplot()
# grid.arrange(gX, gN, nrow = 1)
## Retrieve the pertaining datasets w and s, enabling compute max/min for "seasons" (w or s)
list_years_w <- split(TXTN_closed_w,TXTN_closed_w$year)
list_years_s <- split(TXTN_closed_s,TXTN_closed_s$year)
###### WINTER
max_w <- yearly.extrm(list_years_w)
# Replace the last value which is too low as TX for winter did not occur
max_w$df[116,]$Max <- mean(max_w$df$Max[110:115])
# summary(lm_w <- lm(max_w$df$Max ~ max_w$df$Year))
#
# ggplot(data = max_w$df, aes(x = Year,y = Max)) + geom_point() +theme_bw()
# ggplot(data = max_w$df, aes(x = Max)) + geom_histogram() +theme_minimal()
# ggplot(data = max_w$df, aes(x = Max)) + geom_density() +theme_bw()
##### SUMMER
max_s <- yearly.extrm(list_years_s)
# summary(lm_s <- lm(max_s$df$Max ~ max_s$df$Year))
## As expected, we remark that trend is heavier and "more significant"
## in summer months for TX than in winter months
# ggplot(data = max_s$df,aes(x = Year, y = Max)) + geom_point() +theme_bw()
# ggplot(data = max_s$df,aes(x = Max)) + geom_histogram() +theme_minimal()
# ggplot(data = max_s$df,aes(x = Max)) + geom_density() +theme_bw()
## Comparisons
ggw <- ggplot(data = max_w$df, aes(x = Year, y = Max)) + geom_line(size = 0.3) +
geom_smooth(method='lm',formula = y~x, size = 0.5) +
stat_smooth(method = "loess", se = F, col = 'red', size = 0.5) +
ggtitle("TX For Winter months [10-03]") +
theme_piss(14, 12) +
theme(legend.title = element_text(colour="#33666C",
size=12, face="bold")) +
theme(axis.text.x = element_text(size = 5)) +
theme(axis.text.y = element_text(size = 5))
ggs <- ggplot(data = max_s$df ,aes(x = Year,y = Max)) + geom_line(size = 0.3) +
geom_smooth(method = 'lm',formula = y~x, size = 0.5) +
stat_smooth(method = "loess", se = F, col = 'red', size = 0.5) +
ggtitle("TX For Summer months [04-09]") +
theme_piss(14, 12) +
theme(legend.title = element_text(colour="#33666C",
size=12, face="bold")) +
theme(axis.text.x = element_text(size = 5)) +
theme(axis.text.y = element_text(size = 5))
################### For MINIMA ##############################
## Retrieve the minmimum for datasets w and s
## "Winter""
min_w <- yearly.extrm(list_years_w, Fun = min, tmp = 'TN')
# summary(lm_w_min <- lm(min_w$df$Min ~ min_w$df$Year))
#
# ggplot(data =min_w$df ,aes(x = Year, y = Min)) + geom_line() +
# geom_smooth(method = 'lm',formula = y~x) + theme_bw() +
# stat_smooth(method = "loess", se = F, col = 'red')
#
# ggplot(data = min_w$df, aes(x = Year, y = Min)) + geom_point() + theme_bw()
# ggplot(data = min_w$df, aes(x = Min)) + geom_histogram() + theme_minimal()
# ggplot(data = min_w$df, aes(x = Min)) + geom_density() + theme_bw()
## "Summer""
min_s <- yearly.extrm(list_years_s, Fun = min, tmp = "TN")
# summary(lm_s_min <- lm(min_s$df$Min ~ min_s$df$Year))
# ggplot(data = min_s$df, aes(x = Year,y = Min)) + geom_point() +theme_bw()
# ggplot(data = min_s$df, aes(x = Min)) + geom_histogram() +theme_minimal()
# ggplot(data = min_s$df, aes(x = Min)) + geom_density() +theme_bw()
gmin.w <- ggplot(data =min_w$df ,aes(x = Year, y = Min)) + geom_line(size = 0.3) +
geom_smooth(method = 'lm', formula = y~x, size = 0.5) +
stat_smooth(method = "loess", se = F, col = 'red', size = 0.5) +
ggtitle("TN for winter months [10-03]") +
theme_piss(14, 12) +
theme(legend.title = element_text(colour="#33666C",
size=12, face="bold")) +
theme(axis.text.x = element_text(size = 5)) +
theme(axis.text.y = element_text(size = 5))
gmin.s <- ggplot(data = min_s$df, aes(x = Year, y = Min)) + geom_line(size = 0.3) +
geom_smooth(method = 'lm', formula = y~x, size = 0.5) +
ggtitle("TN for summer months [04-09]") +
stat_smooth(method = "loess", se = F, col = 'red', size = 0.5) +
theme_piss(14, 12) +
theme(legend.title = element_text(colour="#33666C",
size=9, face="bold")) +
theme(axis.text.x = element_text(size = 5)) +
theme(axis.text.y = element_text(size = 5))
#### All
grid.arrange(ggw, ggs,gmin.w, gmin.s, nrow = 2)var(max_s$data) ; var(max_w$data) ; var (min_s$data) ; var(min_w$data)## [1] 4.617603
## [1] 5.484782
## [1] 2.446635
## [1] 12.71684
We see that the variance well more for minima in winter than in summer months, etc…
Here, we have plotted the yearly extremes, so it is relevant regarding an analysis with GEV. However, if we plot the complete series (for POT) we see that there is still a seasonnal (non-stationnary) pattern. (see later)
## Or more convenient to work with NEGATED data for TN, for further analysis
neg_min_s <- min_s$df ; neg_min_s$negMin <- -neg_min_s$Min
neg_min_w <- min_w$df ; neg_min_w$negMin <- -neg_min_w$Min
gnegmin.w <- ggplot(data = neg_min_w ,aes(x = Year, y = negMin)) + geom_line() +
geom_smooth(method = 'lm', formula = y~x) + theme_bw() + theme_piss(14, 12) +
stat_smooth(method = "loess", se = F, col = 'red') + ggtitle("Min for winter months")
gnegmin.s <- ggplot(data = neg_min_s, aes(x = Year, y = negMin)) + geom_line() +
geom_smooth(method = 'lm', formula = y~x) + ggtitle("Min for summer months") + theme_piss(14, 12) +
stat_smooth(method = "loess", se = F, col = 'red')
# grid.arrange(gnegmin.w, gnegmin.s, nrow = 2)
#################
# As expected, the trend is (slightly) heavier for TX in warm months
# and heavier for TN in cold month. Test if it is really significant
# summary(glm(TXTN_closed_ws$TX ~ max_s$df$Year * TXTN_closed_ws$period))
# summary(lm(TXTN_closed_ws$TX~TXTN_closed_ws$period * TXTN_closed_ws$year))
# summary(lm(TXTN_closed_ws$TX~TXTN_closed_ws$period))Well, there is a plenty of different ways of dealing with the analysis.
Again, one could want to test (difference in slopes,…) between some of these results. For example, if slope is heaier in TX of summer months or TN of winter months, …
Division on the 4 real seasons separately Plot of the first 1000 obserations of the whole series :
# ggplot(TXTN_closed[1:2000, ]) + geom_point(aes(x = Date, y = TX)) # change value
# As we have seen, strong reccurent pattern occurs
TXTN_cl_wint <- TXTN_closed[TXTN_closed$season == "Winter", ]
TXTN_cl_spring <- TXTN_closed[TXTN_closed$season == "Spring", ]
TXTN_cl_summ <- TXTN_closed[TXTN_closed$season == "Summer", ]
TXTN_cl_autu <- TXTN_closed[TXTN_closed$season == "Autumn", ]
# Add column which allows to retrieve the index for plotting
TXTN_cl_wint <- cbind(TXTN_cl_wint, index = 1:nrow(TXTN_cl_wint))
TXTN_cl_spring <- cbind(TXTN_cl_spring, index = 1:nrow(TXTN_cl_spring))
TXTN_cl_summ <- cbind(TXTN_cl_summ, index = 1:nrow(TXTN_cl_summ))
TXTN_cl_autu <- cbind(TXTN_cl_autu, index = 1:nrow(TXTN_cl_autu))
ga <- ggplot(TXTN_cl_autu[1:1000,]) + geom_point(aes(x = index, y = TX)) +
ggtitle("TX For autmun") + theme_piss(14,12)
gw <- ggplot(TXTN_cl_wint[1:1000,]) + geom_point(aes(x = index, y = TX)) +
ggtitle("TX for winter") + theme_piss(14,12)
gs <- ggplot(TXTN_cl_spring[1:1000,]) + geom_point(aes(x = index, y = TX)) +
ggtitle("TX for spring") + theme_piss(14,12)
gsu <- ggplot(TXTN_cl_summ[1:1000,]) + geom_point(aes(x = index, y = TX)) +
ggtitle("TX for summer") + theme_piss(14,12)
# (change)
grid.arrange(ga, gw, gs, gsu)# Not sufficient to model seasons separatelyHere we see that the reccurrent seasonnal pattern still holds (\(\Rightarrow\) non-stationnarity) and it is more strong for autumn and spring (intuitive, these are “transition months”)
Warmest months for TX : July and Augustus and
Coolest months for TN : Januari and Februari
We represent here the first 2000 observations of the whole series :
txtn_cl_warm <- TXTN_cl_summ[TXTN_cl_summ$month %in% c(7,8), -8 ]
txtn_cl_warm <- cbind(txtn_cl_warm, index = 1:nrow(txtn_cl_warm))
gw <- ggplot(txtn_cl_warm[1:2000,]) + geom_line(aes( x = index, y = TX)) +
ggtitle("TX for July and August") +
geom_hline(yintercept = mean(txtn_cl_warm$TX), col= "red") + theme_piss()
list_years_w <- split(txtn_cl_warm, txtn_cl_warm$year)
yearl_ext_warm <- yearly.extrm(list_years_w)
#summary( gev_warm <- fevd(yearl_ext_warm$data, units = "deg C") )
######### Coolest months for TN (Januari and februari) #############
######################
txtn_cl_cool <- TXTN_cl_wint[TXTN_cl_wint$month %in% c(1,2), -8 ]
txtn_cl_cool <- cbind(txtn_cl_cool, index = 1:nrow(txtn_cl_cool))
gc <- ggplot(txtn_cl_cool[1:2000, ]) + geom_line(aes( x = index, y = TN)) +
ggtitle("negated TN for Januari and Februari") +
geom_hline(yintercept = mean(txtn_cl_cool$TN), col= "red") + theme_piss()
### Change sign of the values and work with maxima !
tn_cl_cool_neg <- txtn_cl_cool
tn_cl_cool_neg$TN <- -(tn_cl_cool_neg$TN)
list_years_c <- split(tn_cl_cool_neg, tn_cl_cool_neg$year)
yearl_ext_cool <- yearly.extrm(list_years_c, Fun = min, tmp = "TN")
#summary( gev_cool <- fevd(yearl_ext_cool$data, units = "deg C") )
grid.arrange(gw, gc, nrow = 1)Here, we remark with this method that assumption of stationarity is much more fulfilled (at least, regarding seasonnality). However, we must also think about the loss of information …
For all these results, it is for sure interesting to see and to compare the results of a POT or GEV fitting to these sub-analysis (not shown here).
Example code : Results are hidden
###############################################################################
################## POT taken on divided datasets ########################
#############################################################################
library(fExtremes)
TX_all_s <- TXTN_closed_s$TX
TX_all_w <- TXTN_closed_w$TX
############# max TX on summer dataset
threshrange.plot(TX_all_s, r = c(25,35))
# mean residual life plot
u <- seq(min(TX_all_s),max(TX_all_s),0.01)
x <- c()
for (i in 1:length(u)) {
threshold.excess <- TX_all_s[TX_all_s > u[i] ]
x[i] <- mean(threshold.excess - u[i])
}
plot(x~u,type="l",main="MRL plot",ylab="mean excess")
mrl.plot(TX_all_s) # Decreasing : we have LTE distribution (xi negative)
# Let's go for 32-34deg as threshold, as see ~some linearity in mrl plot
above_thres_s <- TX_all_s[TX_all_s>32]
fit_pot_s_1 <- gpd.fit(TX_all_s,32)
gpd.diag(fit_pot_s_1)
fit_pot_s_11 <- fevd(TX_all_s,threshold = 32,type="GP")
plot(fit_pot_s_11)
pextRemes(fit_pot,c(25,30,35,36),lower.tail = FALSE)
##########################
par(mfrow=c(2,1))
acf(above_thres_s,lag.max = length(above_thres_s))
pacf(above_thres_s,lag.max = length(above_thres_s))
# This clearly increase when we decrease the threshold...
plot(above_thres_s[1:length(above_thres_s)-1],above_thres_s[2:length(above_thres_s)])
# Very slight sign of dependence when we take only summer months...Now, let’s go further with the problem of dependence.
Now, we relax the indepence assumption ( with \(D(u_n)\) condition, …)
First, we check the autcorrelations in the series (hidden results)
## Detect autocorrelation/dependence for GEV
acfpacf_plot(max_years$data) # See our source code for this function
# We can detect/suspect the seasonality with the ~harmonic oscillation
# For block maxima,as expected, autocorrelation is weak (but present!)
plot(max_years$data[1:length(max_years$data) - 1],
max_years$data[2:length(max_years$data)])
# Seems really independent. For GEV, it is "okay"
## For POT
acfpacf_plot(above_30$TX)
# Auto-Tail Dependence Function : see extReme 2.0 pdf (for bivariate series...)
atdf(max_all,0.99) # O.95 is u in F^-1(u) over which we compute atdf
# u close to 1 but high enough to incorporate enough data. Then, we have remarked that there is really a clustering of the exceedances.
extremalindex(max_all, threshold = 30) # Method interval was from Segers !!##
## Intervals Method Estimator for the Extremal Index
## NULL
##
## theta.tilde used because there exist inter-exceedance times > 2.
## extremal.index number.of.clusters run.length
## 0.4166258 127.0000000 12.0000000
# obviously, the dependence increase (= theta decreases) as threshold decreases because there are more extremesThe extremes are expected to cluster by group of mean size \(0.42^{-1}\approx 2\)
We represent this by the series above the threshold of 30deg :
above_30 <- TXTN_closed[TXTN_closed$TX>30,]
ggplot(above_30, aes(x = Date, y = TX)) + geom_point() + theme_piss(25,14) +
labs(title = "TX above 30°c") +
theme(axis.line = element_line(color="#33666C", size = .45)) +
geom_vline(aes(xintercept = as.numeric(as.Date("1911-07-27"))),
col = "red", size = 0.15) +
geom_vline(aes(xintercept = as.numeric(as.Date("1976-06-30"))),
col = "red", size = 0.15) We can clearly see that the clustering occurs. We found interesting to highlight that with 2 heat waves occuring in 1911 and 1976. It is shown with the red lines where we see lot of points lying on this line
Point Process model allows to control for the cluster max
pot_control_pp <- fpot(max_all, model = "pp",threshold = 30, cmax = T, r = 0,
start = list(loc = 30, scale = 1, shape = 0), npp = 365.25 )We test for a linear trend in the location parameter \(\mu\).
## Intro
# lmm <- lm(TXTN_closed$TX ~ TXTN_closed$Date * TXTN_closed$season)
# lm0 <- lm(TXTN_closed$TX ~ TXTN_closed$season )
# lm1 <- lm(TXTN_closed$TX ~ TXTN_closed$season + TXTN_closed$Date)
#
# df <- data.frame(obs = TXTN_closed$TX[1:1500], season = predict(lm0)[1:1500],
# season_trend = predict(lm1)[1:1500], interac = predict(lmm)[1:1500],
# Date = TXTN_closed$Date[1:1500])
#
# ggplot(df) + geom_point(aes(x = Date, y = obs)) +
# geom_line(aes(x = Date, y = season, colour = "red")) +
# geom_line(aes( x = Date, y = season_trend, colour = "blue")) +
# geom_line(aes(x = Date, y = interac, colour = "green"))
############### GEV ###########################
# We have seen this reccurent pattern... First, simple linear trend ?
# As we have seen with the LR on all the TX, we expect this to be
# significatie here too (?!)
ti <- matrix (ncol = 1, nrow = length(max_years$data))
ti[ ,1] <- seq(1, length(max_years$data),1)
NOMSG ( gev_nonstatio <- gev.fit(max_years$data, ydat = ti , mul = 1) )
# Value of b_1 is ~~the same than this obtained with linea regression
NOMSG( gev_statio <- gev.fit(max_years$data) )
khi.obs <- 2 * ( (-gev_nonstatio$nllh[1]) - (-gev_statio$nllh[1]) )
q05 <- qchisq(.05, df = 1, lower.tail = F)
# It is significant. P-val ?
pval <- pchisq(khi.obs, df = 1, lower.tail = F)
require(pander)
pander(data.frame(khi.obs = khi.obs, quantileChi2 = q05, p.valeur = pval))| khi.obs | quantileChi2 | p.valeur |
|---|---|---|
| 19.89 | 3.841 | 8.217e-06 |
Not exactly the same as with LR, but same result : we (“highly”) prefer the linear trend model compared with the stationnary model.
More complex model ?
ti2 <- matrix(ncol = 2, nrow = length(max_years$data))
ti2[ ,1] <- seq(1, length(max_years$data), 1)
ti2[ ,2] <- (ti2[,1])^2
NOMSG( gev_nonstatio2 <- gev.fit(max_years$data, ydat = ti2, mul = c(1, 2)) )
pval2 <- pchisq(2 *( (-gev_nonstatio2$nllh[1]) - (-gev_nonstatio$nllh[1]) ), df = 1,
lower.tail = F)
pander(data.frame(p.valeur = pval2, significance = "NO"))| p.valeur | significance |
|---|---|
| 0.4196 | NO |
Comparing with the \(\chi^2\) as above, it is not statisticaly necessary to have more complex model like this.
It is the same result for higher terms in the polynomial. It is not useful to increase complexity in this manner. However, we will see in section 5. if there would not be more complex nonlinear patterns.
Diagnostics of the retained model
#gev_nonstatio$mle
gev.diag(gev_nonstatio)Problem for high quantiles (as “usual”)
Return levels, which are the for model with linear trend. m = 10
rl_10_lin <- return.lvl.nstatio(max_years$df$Year, # See UsedFunc.R
gev_nonstatio, t = 500, m = 10)We Note the Increase of the return level with time (due to trend) With this kind of trend, in 300 years we will " expect to exceed the value of 40deg every 10 years in Uccle“. Or in 100 years for the value of 35deg
However, caution should be exercised in practice concerning whether or not it is believable for the upward linear trend in minimum temperatures to continue to be valid
And what if we started the analysis at year (around) 1965 ? When the data became more reliable and the global warming started to “appear” ….
ti_65 <- matrix (ncol = 1, nrow = 51)
ti_65[ ,1] <- seq(66, 116, 1)
NOMSG( gev_nstatio_65 <- gev.fit(max_years$data[66:116], ydat = ti_65 , mul = 1) )
rl_10_lin65 <- return.lvl.nstatio(max_years$df$Year[66:116],
gev_nstatio_65, t = 500, m = 10) # 1UsedFun.RHere, the interpretation is : for a return period of 10 years, the expectation of 40deg will be reached after less than 200 years…
However, we recall again that it is not reliable to make so far inferences with a so small amount of data. This is just to higlight the fact that a simple linear trend model is a bit weak… And some adjustments has to be made.
These examples also highlight the fact that return levels are not very accurate in a (simple) nonstationnary context (…)
Trend + varying scale parameters
We use an exponential link to ensure \(\sigma>0\).
ti_sig <- matrix(ncol = 2, nrow = length(max_years$data))
ti_sig[,1] <- ti_sig[,2] <- seq(1, length(max_years$data), 1)
NOMSG( gev_nstatio3 <- gev.fit(max_years$data, ydat = ti_sig , mul = 1,
sigl = 2, siglink = exp) )
pchisq(2 * ( (-gev_nstatio3$nllh[1]) - (-gev_nonstatio$nllh[1]) ), df = 1,
lower.tail = F)## [1] 1
P-value tells us that no reasons to vary scale param with time. But Problem in the p-value : complex model with dependent \(\sigma\) is not correct. Investigate but result will be the same.
Seasonal model
ti_seas <- matrix(ncol = 2, nrow = length(max_years$data))
ti_seas[ ,1] <- seq(1, length(max_years$data), 1)
ti_seas[ ,2] <- rep(1:4, length(max_years$data)/4)
t <- 1:length(TXTN_closed$TX)
# seas_effect <- sin(2 * pi * t / 365.25)
# summary(lm_seas_sin <- lm(TXTN_closed$TX ~ seas_effect))
## Better to use nonlinear model But Beware of the complexity....
nls.mod <- nls(max_all ~ a + b * cos(2*pi*t/365.25 - c),
start = list(a = 1, b = 1, c = 1))
co <- coef(nls.mod)
f <- function(x, a, b, c) {a + b*cos(2*pi*x/365.25 - c) }
ggplot(cbind.data.frame(TXTN_closed[1:5000, ], pred = predict(nls.mod)[1:5000])) +
geom_point(aes(x = Date, y = TX), size = 0.4) + geom_line(aes(x = Date, y = pred), col = "red") +
ggtitle("5000 first obs. with non linear model (red) for the seasons") + theme_piss(15,14)We see that this “model” seems perfect to capture the seasonal effect in our series.Further, there are many possibilities to use that.
Varying Threshold
Here, we will use the nonlinear model which seems accurate for modelling the seasons. Furthermore, one can easily assume these seasonnal effects are constant over time.
u <- predict(nls.mod) # see the seasonal model above
NOMSG( gpd_varu <- fevd(TX, TXTN_closed, threshold = u, verbose = T,
type = "GP", time.units = "365/year", units = "deg C") )
print(gpd_varu$results$par) ; print(gpd_varu$results$hessian)## scale shape
## 4.023540 -0.259597
## scale shape
## scale 4743.173 50844.76
## shape 50844.761 740747.02
While shape parameter \(\xi\) looks very similar (good news) to the previous modelling, scale parameter is slightly different. This is not surprizing while \(\sigma=\sigma_u\) is dependent of the threshold in a POT context.
above_u <- TXTN_closed ; above_u$exceed <- above_u$TX - u
above_u1 <- above_u[above_u$exceed>0,]
ggplot(above_u1[1:2000, ], aes(x = Date, y = TX)) + geom_point() + ggtitle("First 2000 residuals of the series from the nonlinear seasonnal model") + theme_piss(15, 13)nrow(above_u1)## [1] 20595
Lots of exceedances here (> 20000) and thus the seasonnal effects looks to still be there. (…)
Let’s now play with the “intercept” of the model
Heavier threshold in the seasonnal model
(Study bias-variance tradeoff wrt u for the choice)
We increase the threshold. as this allows us to manage the number of thresholds
nls.mod1 <- nls(max_all + 10 ~ a + b * cos(2 * pi * t / 365.25 - c),
start = list(a = 1, b = 1, c = 1))
above_u$exceed_red <- above_u$TX - predict(nls.mod1)
above_ured <- above_u[above_u$exceed_red > 0, ]
g1 <- ggplot(cbind.data.frame(TXTN_closed[1:5000, ], pred = predict(nls.mod1)[1:5000])) + ggtitle("Higher seasonnal varying threshold") + theme_piss(19, 14) +
geom_point(aes(x = Date, y = TX), size = 0.4) + geom_line(aes(x = Date, y = pred), col = "red")
g2 <- ggplot(above_ured, aes(x = Date, y = TX)) + geom_point() + ggtitle("Excesses from this model") + theme_piss(19,14) +
geom_smooth(method = "lm",formula = y~x) +
geom_vline(aes(xintercept = as.numeric(as.Date("1911-07-27"))),
col = "red", size = 0.3) +
geom_vline(aes(xintercept = as.numeric(as.Date("1976-06-30"))),
col = "red", size = 0.3)
grid.arrange(g1, g2, nrow = 2)Smaller excesses in the end, but heavier excess in the start. That is why this linear (no very informative) trend is slightly decaying.
Moreover, the problem of clustering is still present, but we can have now excesses coming from all seasons (including winter but more for autumns and spring) (investigate)
One can remark for example the same clusters that we noticed above (in red) representing the heat waves in 1911 and 1976 (…)
table(above_ured$year) ##
## 1903 1904 1906 1911 1912 1915 1917 1918 1919 1920 1921 1922 1923 1925 1926
## 1 1 2 10 1 1 1 2 2 1 8 3 2 1 2
## 1928 1929 1930 1931 1932 1934 1937 1938 1939 1940 1941 1942 1943 1944 1945
## 1 3 4 1 3 5 2 3 4 2 3 4 3 5 10
## 1946 1947 1948 1949 1950 1952 1953 1954 1955 1957 1959 1960 1961 1964 1966
## 3 17 1 7 2 4 1 1 1 3 6 2 5 3 1
## 1968 1969 1975 1976 1984 1985 1986 1988 1989 1990 1991 1992 1993 1994 1995
## 4 2 4 15 1 2 2 1 5 10 5 1 2 2 5
## 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
## 4 1 9 1 1 4 2 8 2 7 8 12 1 4 3
## 2011 2012 2013 2014 2015 2016
## 13 7 4 9 5 5
nrow(above_ured)## [1] 319
We clearly see there are more extremes in the end of the series. The total number of exceedances (319) can be tuned.
NOMSG( gpd_varu_red <- fevd(TX, TXTN_closed, threshold = predict(nls.mod1),
verbose = T, type = "GP", time.units = "365/year", units = "deg C") )
# plot(gpd_varu_red)
NOMSG( gpd_varu_red <- gpd.fit(max_all, predict(nls.mod1)) ) ; gpd_varu_red$mle## [1] 1.3928317 -0.2066115
gpd.diag(gpd_varu_red)Again the problem for very high quantiles, but model seems okay. MLE’s are comparable with those previously obtained (varying with \(u\) for \(\sigma\) but constant in theory for \(\xi\)) ….
Declustering
We have seen the clusters of exceedances are still appearing in this model.
extremalindex(max_all, threshold = predict(nls.mod1), method = "intervals") ##
## Intervals Method Estimator for the Extremal Index
## NULL
##
## theta.tilde used because there exist inter-exceedance times > 2.
## extremal.index number.of.clusters run.length
## 0.3362382 108.0000000 72.0000000
Indeed, this is far from the independent series… and exceedances are expected to cluster by groups of mean size \(\theta^{-1}\approx\) 3 here !!!
# Here, in PP, value for threshold must be fixed....
# fpot(max_all, model= "pp", threshold = predict(nls.mod1), cmax = T, r = 0,
# start = list(loc = 30, scale = 1, shape = 0), npp = 365.25 )
invisible( unique(decl_varu <- clusters(max_all,u = predict(nls.mod1), cmax = T, rlow = 0)) )
# change value of parameters, this is probably not optimal.
df_decl <- data.frame(index = names(decl_varu), TX = unname(decl_varu))
retr_date <- TXTN_closed$Date[rownames(TXTN_closed) %in% as.character(df_decl$index) == T ]
df_decl <- cbind.data.frame(df_decl, Date = retr_date)
ggplot(df_decl, aes(x = Date, y = TX)) + geom_point() + ggtitle("Declustered series") + theme_piss(20, 14) + geom_vline(aes(xintercept = as.numeric(as.Date("1911-07-27"))),
col = "red", size = 0.3) +
geom_vline(aes(xintercept = as.numeric(as.Date("1976-06-30"))),
col = "red", size = 0.3)The series seems much more interesting now ! The clustering (see again red lines) and thus dependence is more present, but this seems well visually “better” now, in the sense that we use the whole series. We can also manage (hyper)parameters to obtain “better” results.
Investigate ! Moreover, the complexity of the model should not be appreciated. Overfitting, …
this is also possible to do these analysis in the PP framework. Packages enable to implement “”directly“” the (varying) threshold function. We let some results in the following. While there is nothing very new, we hide the output results. Some more improvements could be made from this ?
attach(TXTN_closed)
gev_pp <- fevd(Max, max_years$df, threshold = 32,
type = "PP", verbose = T, threshold.fun = I(year),
location.fun = I(year), time.units = "365/year")
detach(TXTN_closed)
TXTN_closed$t <- t
gev_pp1 <- fevd(TX, TXTN_closed, threshold = 32, location.fun =
~ cos(2 * pi * t / 365.25) + sin(2 * pi * t / 365.25),
type = "PP", units = "deg C")
lr.test(gev_pp, gev_pp1)
pp_coles <- fpot(max_all, model= "pp",threshold = 32,
start = list(loc = 30, scale = 1, shape = 0),
npp = 365.25 )
plot(pp_coles)
excess_coles <- fpot (max_all, threshold = 32, npp = 365.25)
# Same results for the shape, but no more location parameter
# since it conditions on the exceedances. Different scaleNow, let’s further in the difficult analysis of non-stationnarity. We want to be able to retrieve more complex relations than only linear, quadratic,… or any proposal function.
library(GEVcdn)
## 1) Define the hierarchy of the models by increasing complexity
models <- list()
# Stationary model
models[[1]] <- list(Th = gevcdn.identity,
fixed = c("location", "scale", "shape"))
# Linear models
models[[2]] <- list(Th = gevcdn.identity, fixed = c("shape","scale"))
models[[3]] <- list(Th = gevcdn.identity, fixed = c("shape"))
models[[4]] <- list(Th = gevcdn.identity)
# Nonlinear (with logistic link) with 1 or 2 hidden nodes
models[[5]] <- list(n.hidden = 1, Th = gevcdn.logistic, fixed = c("shape", "scale"))
models[[6]] <- list(n.hidden = 2, Th = gevcdn.logistic, fixed = c("shape", "scale"))
models[[7]] <- list(n.hidden = 1, Th = gevcdn.logistic, fixed = c("shape"))
models[[8]] <- list(n.hidden = 2, Th = gevcdn.logistic, fixed = c("shape"))
models[[9]] <- list(n.hidden = 1, Th = gevcdn.logistic)
models[[10]] <- list(n.hidden = 2, Th = gevcdn.logistic)
x <- as.matrix(seq(1, length(max_years$data)))
y <- as.matrix(max_years$data)
## 2) Fit the models and retrieve the weights
weights.models <- list()
for(i in seq_along(models)){
weights.models[[i]] <- gevcdn.fit2(x = x, y = y, n.trials = 1,
n.hidden = models[[i]]$n.hidden,
Th = models[[i]]$Th,
fixed = models[[i]]$fixed)
# We slightly modified the function to hide unuseful messages
}
## Select best model
models.AICc <- round(sapply(weights.models, attr, which = "AICc"), 3)
# Comparing the AICc, we confirm that shape parameter must be held fixed.
# But last model seems also good...
models.BIC <- round(sapply(weights.models, attr, which = "BIC"), 3)
# Clear evidence for the 5th model (simple linear trend in location parameter)
# BIC penalizes more the more complex models --> parsimony
weights.best <- weights.models[[which.min(models.AICc)]]
parms.best <- gevcdn.evaluate(x, weights.best)
print(data.frame(AICc = models.AICc, BIC = models.BIC))## AICc BIC
## 1 -19.559 -11.513
## 2 -37.389 -26.735
## 3 -35.405 -22.183
## 4 -34.197 -18.446
## 5 -35.439 -19.688
## 6 -37.393 -14.309
## 7 -36.163 -17.925
## 8 -35.180 -7.429
## 9 -33.985 -13.302
## 10 -28.349 3.879
You can see the table in slide 3 (from the “Atelier”) which summarize more smoothly the results.
Then, we can for example compare our models using the deviance statistics :
## stationary vs linear trend
nll1 <- attr(weights.models[[1]], "NLL")
nll5 <- attr(weights.models[[5]], "NLL")
pchisq( 2 *( (-nll5) - (-nll1) ), lower.tail = F,
df = attr(weights.models[[5]], "k") - attr(weights.models[[1]], "k"))## [1] 5.293604e-05
\(\approx\) exactly the same result as previously done. That is, the linear trend model is clearly significant against the stationnary model. Modelling more complex (nonlinear) relations between time and the behaviour of the extremes do not seem to be statistically important for our task. See e.g. \(AIC_c\) and BIC above.
From the best model, we can now retrieve for example some quantiles (…)
q.best <- sapply(c(0.1, 0.5, 0.9), qgev,
location = parms.best[,"location"],
scale = parms.best[,"scale"],
shape = parms.best[,"shape"])However, we see that the results vary : this is due to the random initilization of the weights (…)
A well known ensemble procedure enables to tackle this issue \(\Rightarrow\)
Bootstrap aggregating is a process of fitting an ensemble of bootstrapped models and thus to construct more precise (less variable) estimates (…)
We allow here for only 1 hidden node. We have see previously that it is not necessary to introduce too much complexity.
n.boot <- 10 # Increase value
NOMSG( weights.on <- gevcdn.bag(x = x, y = y, iter.max = 100,
iter.step = 10, n.bootstrap = n.boot,
n.hidden = 1) )
parms.on <- lapply(weights.on, gevcdn.evaluate, x = x)
# parms <- list()
# for (i in 1:n.boot){
# parms[[i]] <- apply(parms.on[[i]],2, mean)
# }
# parms <- lapply(parms.on, 2, FUN = mean) # Try with this
parms <- matrix(0, nrow = nrow(parms.on[[1]]), ncol = 3)
for (i in 1:n.boot){
parms <- parms + as.matrix(parms.on[[i]])
}
parms <- parms / length(parms.on)
## All The parameters of the model will vary through time
q <- t(sapply(max_years$data, quantile, probs = c(.1, .5, .9)))
q.10.on <- q.50.on <- q.90.on <- c()
for(i in seq_along(parms.on)){
q.10.on <- cbind(q.10.on, VGAM::qgev(p = 0.1,
location = parms.on[[i]][,"location"],
scale = parms.on[[i]][,"scale"],
shape = parms.on[[i]][,"shape"]))
q.50.on <- cbind(q.50.on, VGAM::qgev(p = 0.5,
location = parms.on[[i]][,"location"],
scale = parms.on[[i]][,"scale"],
shape = parms.on[[i]][,"shape"]))
q.90.on <- cbind(q.90.on, VGAM::qgev(p = 0.9,
location = parms.on[[i]][,"location"],
scale = parms.on[[i]][,"scale"],
shape = parms.on[[i]][,"shape"]))
}
## Plot data and quantiles
matplot(cbind(y, q, rowMeans(q.10.on), rowMeans(q.50.on),
rowMeans(q.90.on)), type = c("b", rep("l", 6)),
lty = c(1, rep(c(1, 2, 1), 2)),
lwd = c(1, rep(c(3, 2, 3), 2)),
col = c("red", rep("orange", 3), rep("blue", 3)),
pch = 19, xlab = "x", ylab = "y",
main = "gevcdn.bag (early stopping on)") # Change for a ggplot ?We see the evolution through time of the quantiles (blue) from the model following hte behaviour of the process (…)
Doing this with the P50 dataset(rainfall measurements) also provided by the IRM could be interesting. We can expect more complex non-stationnary behaviour for this kind of processes.
We found that evdbayes package typically uses the Metropolis-Hastings (MH) algorithm as MCMC sampler. We are aware that this probably not the most efficient algorithm available in the literature, but it is “easy”" to implement and understand. Beware : We do not know if it is either the MH or the Gibbs sampler which is implemented when doing simulations with this package. (Hartmann and Ehlers 2016) state in their article that it is the MH but in the package’s source functions we see that this is rather the Gibbs sampler. We found no information about it somewhere else provided in the package….
However, we will try to (compare and to) rely on other ways than this sole package, e.g.
1. Implement our own functions. The idea is to better understand the “black-box” and the hidden Bayesian’s mechanism, which is difficult when using only package’s functions. Moreoever, it will allow us to implement other algorithm (MH or Gibbs), to have better flexibility, … We will be mainly based on the book (Dey and Yan 2016, chap. 13).
2. Hamiltonian Monte Carlo based mainly on the same article (Hartmann and Ehlers 2016) (…). The objective is then to use the Stan language which makes use of this technique, and which is built with the compiled language c++. This is (really) more efficient and thus would be preferable.
3. revdbayes ? Using sample ratio of uniforms (…) Not yet studied
There are still problems in there … Objective would be to create flexible and goal-oriented functions from this (…)
# Negative log-likelihood for GEV
gev.nloglik = function(mu, sig, xi, data){
y = 1 + (xi * (data - mu))/sig
if((sig < 0) || (min(y) < 0)) {
ans = 1e+06
} else {
term1 = length(data) * logb(sig)
term2 = sum((1 + 1/xi) * logb(y))
term3 = sum(y^(-1/xi))
ans = term1 + term2 + term3
}
ans
}
# Posterior Density Function
# Compute the log_posterior. Be careful to incorporate the fact that
# the distribution can have finite endpoints.
log_post <- function(mu, logsig,xi, data) {
llhd <- -(gev.nloglik(mu = mu, sig = exp(logsig),
xi = xi, data = data))
lprior <- dnorm(mu, sd = 50, log = TRUE)
lprior <- lprior + dnorm(logsig, sd = 50, log = TRUE)
lprior <- lprior + dnorm(xi, sd = 5, log = TRUE)
lprior + llhd
}
############ MEtropolis-Hastlings ###############
#####################################################
# Optimize Posterior Density Function
fn <- function(par, data) -log_post(par[1], par[2], par[3], data)
param <- c(mean(max_years$df$Max),log(sd(max_years$df$Max)), 0.1 )
opt <- optim(param, fn, data = max_years$data,
method="BFGS", hessian = TRUE)
opt <- nlm(fn, param, data = max_years$data,
hessian=T, iterlim = 1e5)
opt
start <- opt$estimate
Sig <- solve(opt$hessian)
c <- (2.4/sqrt(2))^2 * Sig
# Simulation Loop
ev <- eigen(c^2 * Sig)
varmat <- ev$vectors %*% diag(sqrt(ev$values)) %*% t(ev$vectors)
iter <- 2000; out <- matrix(NA, nrow = iter+1, ncol = 3)
dimnames(out) <- list(0:iter, c("mu", "logsig", "xi"))
out[1,] <- start
lpost_old <- log_post(out[1,1], out[1,2], out[1,3], max_years$data)
if(!is.finite(lpost_old))
stop("starting values give non-finite log_post")
acc_rates <- numeric(iter)
for(t in 1:iter) {
prop <- rnorm(3) %*% varmat + out[t,] # add tuning parameter delta ?
lpost_prop <- log_post(prop[1], prop[2], prop[3], data)
r <- exp(lpost_prop - lpost_old) # as the prop is symmetric
if(r > runif(1)) {
out[t+1,] <- prop
lpost_old <- lpost_prop
}
else out[t+1,] <- out[t,]
acc_rates[t] <- min(r, 1)
}
########## GIBBS sampler #####################
###############################################
# Optimize Posterior Density Function
# fn <- function(par, data) -log_post(par[1], par[2], par[3], data)
# opt <- nlm(fn, param, data = max_years$data,
# hessian=T, iterlim = 1e5)
# opt$estimate
# Simulation Loop
#prop_var <- sqrt( (2.4/sqrt(1))^2 * solve(opt$hessian) )
propsd <- c(.4,.1, .1) # As proposed by Coles, but tune(?) it !
iter <- 1000
out <- data.frame(mu = rep(NA, iter+1),
logsig = rep(NA, iter+1),
xi = rep(NA, iter+1))
out[1,] <- opt$estimate
out <- cbind.data.frame(out, iter = 1:(iter+1))
lpost_old <- log_post(out[1,1], out[1,2], out[1,3], data)
if(!is.finite(lpost_old))
stop("starting values give non-finite log_post")
acc_rates <- matrix(NA, nrow = iter, ncol = 3)
data <- max_years$data
for(t in 1:iter) {
prop1 <- rnorm(1, mean = out[t,1], propsd[1]) # symmetric too
# so that it removes in the ratio.
lpost_prop <- log_post(prop1, out[t,2], out[t,3], data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,1] <- prop1
lpost_old <- lpost_prop
}
else out[t+1,1] <- out[t,1]
acc_rates[t,1] <- min(r, 1)
prop2 <- rnorm(1, mean = out[t,2], propsd[2])
lpost_prop <- log_post(out[t+1,1], prop2, out[t,3], data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,2] <- prop2
lpost_old <- lpost_prop
}
else out[t+1,2] <- out[t,2]
acc_rates[t,2] <- min(r, 1)
prop3 <- rnorm(1, mean = out[t,3], propsd[3])
lpost_prop <- log_post(out[t+1,1],out[t+1,2], prop3, data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,3] <- prop3
lpost_old <- lpost_prop
}
else out[t+1,3] <- out[t,3]
acc_rates[t,3] <- min(r, 1)
}
mean_acc <- apply(acc_rates, 2, mean)
mean_acc
# Do not forget Burn in period
burn <- iter/4
out <- out[-(1:burn),]
library(ggplot2)
library(gridExtra)
g.mu <- ggplot(out) + geom_line(aes(x = iter, y = mu))
g.logsig <- ggplot(out) + geom_line(aes(x = iter, y = logsig))
g.xi <- ggplot(out) + geom_line(aes(x = iter, y = xi))
grid.arrange(g.mu,g.logsig,g.xi, nrow = 3)
param_gibbs <- apply(out[,1:3], 2, mean)
param_gibbs
#####################################################################
################### Gibbs Sampler with Nonstationarity
log_post <- function(mu0, mu1, logsig, xi, data) {
theta <- c(mu0, mu1, logsig, xi)
tt <- ( min(max_years$df$Year):max(max_years$df$Year) -
mean(max_years$df$Year) ) / length(max_years$data)
mu <- mu0 + mu1 * tt # sig <- exp(-delta) * mu
#if(any(sig <= 0)) return(-Inf)
llhd1 <- sum(evd::dgev(data, loc = mu, scale = exp(logsig), xi,
log = TRUE))
#llhd2 <- sum(log(pgev(-data[cs], loc = -mu[cs], scale = sig[cs], xi)))
mnpr <- c(30,0,0,0) ; sdpr <- c(40,40,10,10)
lprior <- sum(dnorm(theta, mean = mnpr, sd = sdpr, log = TRUE))
# lprior <- dnorm(mu0, sd = 50, log = TRUE)
# lprior <- dnorm(mu1, sd = 50, log = TRUE)
# lprior <- lprior + dnorm(logsig, sd = 50, log = TRUE)
# lprior <- lprior + dnorm(xi, sd = 5, log = TRUE)
lprior + llhd1 #+ llhd2
}
fn <- function(par, data) -log_post(par[1], par[2], par[3],
par[4], data)
param <- c(mean(max_years$df$Max), 0, log(sd(max_years$df$Max)), -0.1 )
opt <- optim(param, fn, data = max_years$data,
method = "BFGS", hessian = T)
opt <- nlm(fn, param, data = max_years$data,
hessian=T, iterlim = 1e5)
opt
# Starting Values
set.seed(100)
start <- list(); k <- 1
while(k < 5) { # starting value is randomly selected from a distribution
# that is overdispersed relative to the target
sv <- as.numeric(rmvnorm(1, opt$estimate, 50 * solve(opt$hessian)))
svlp <- log_post(sv[1], sv[2], sv[3], sv[4], data)
print(svlp)
if(is.finite(svlp)) {
start[[k]] <- sv
k <- k + 1
}
}
# Simulation Loop : 4 Chains With Different Starting Values
set.seed(100)
for(k in 1:4) {
propsd <- c(1.3, 8.5, .05, .05) # Trial-and-error method
iter <- 1000; out <- data.frame(mu0 = rep(NA, iter+1),
mu1 = rep(NA, iter+1),
logsig = rep(NA, iter+1),
xi = rep(NA, iter+1))
out[1,] <- start[[k]]
lpost_old <- log_post(out[1,1], out[1,2], out[1,3], out[1,4], data)
if(!is.finite(lpost_old))
stop("starting values give non-finite log_post")
acc_rates <- matrix(NA, nrow = iter, ncol = 4)
for(t in 1:iter) {
prop1 <- rnorm(1, mean = out[t,1], propsd[1])
lpost_prop <- log_post(prop1, out[t,2], out[t,3], out[t,4], data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,1] <- prop1
lpost_old <- lpost_prop
}
else out[t+1,1] <- out[t,1]
acc_rates[t,1] <- min(r, 1)
prop2 <- rnorm(1, mean = out[t,2], propsd[2])
lpost_prop <- log_post(out[t+1,1], prop2, out[t,3], out[t,4], data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,2] <- prop2
lpost_old <- lpost_prop
}
else out[t+1,2] <- out[t,2]
acc_rates[t,2] <- min(r, 1)
prop3 <- rnorm(1, mean = out[t,3], propsd[3])
lpost_prop <- log_post(out[t+1,1], out[t+1,2], prop3, out[t,4], data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,3] <- prop3
lpost_old <- lpost_prop
}
else out[t+1,3] <- out[t,3]
acc_rates[t,3] <- min(r, 1)
prop4 <- rnorm(1, mean = out[t,4], propsd[4])
lpost_prop <- log_post(out[t+1,1], out[t+1,2], out[t+1,3], prop4, data)
r <- exp(lpost_prop - lpost_old)
if(r > runif(1)) {
out[t+1,4] <- prop4
lpost_old <- lpost_prop
}
else out[t+1,4] <- out[t,4]
acc_rates[t,4] <- min(r, 1)
}
assign(paste0("out", k), out)
assign(paste0("acc_rates", k), acc_rates)
}
mean_acc <- apply(acc_rates, 2, mean)
mean_acc
# Combine Chains And Remove Burn-In Period
hf <- ceiling(iter/2 + 1)
out <- rbind(out1[-(1:hf), ], out2[-(1:hf), ],
out3[-(1:hf), ], out4[-(1:hf), ])
out <- cbind.data.frame(out, iter = (1:nrow(out)))
g.mu0 <- ggplot(out) + geom_line(aes(x = iter, y = mu0))
g.mu1 <- ggplot(out) + geom_line(aes(x = iter, y = mu1))
g.logsig <- ggplot(out) + geom_line(aes(x = iter, y = logsig))
g.xi <- ggplot(out) + geom_line(aes(x = iter, y = xi))
grid.arrange(g.mu0, g.mu1, g.logsig, g.xi, nrow = 4)
# Markov Chain Correlations
autocorr(mcmc(out1))
crosscorr(mcmc(out1))
autocorr.diag(mcmc(out1))
autocorr.plot(mcmc(out1))
crosscorr.plot(mcmc(out1))
#### See end of bayesian01 for predictive dist, quantiles, and other...Globally it is the first package to use directly for EVT. It is maybe a bit old (more than 10 years…), so it is not really efficient but it works well. This package is also interesting as it easily allows for a trend, and as we have seen it is important for our case.
library(evdbayes)
var.prior <- diag(c(10000, 10000, 100))
## Large variance prior : near flat (vague) uninformative priors
pn <- prior.norm(mean = c(0,0,0), cov = var.prior)
## Arbitray starting values in t_0
n <- 1000 ; t0 <- c(31, 1 ,0) ; s <- c(.02,.1,.1)
## s contains sd for proposal distributions. Tune it
max.mc1 <- posterior(n, t0, prior = pn, lh = "gev", data = max_years$data, psd = s)
library(coda)
mcmc.max1 <- mcmc (max.mc1, start = 0, end = 1000)
#plot(mcmc.max1, den = F, sm = F)
### Better to optimize the posterior to find better starting values for MCMC
maxpst <- mposterior(init = t0, prior = pn, lh = "gev", method = "BFGS",
data = max_years$data) # Optimization method, like SANN does not change anything
start.val <- maxpst$par
## use this a initial value, then redo the analysis (...)
max.mc2 <- posterior(n, start.val, prior = pn, lh = "gev",
data = max_years$data, psd = s)
mcmc.max2 <- mcmc (max.mc2, start = 0, end = 1000)
#plot(mcmc.max2, den = F, sm = F)Problem : Mixing of the chains is poor (not shown), especially for \(\mu\). Change the standard deviation of the proposal distribution (psd). We also add a necessary (and arbitrary for now) burn-in period.
psd <- invisible( ar.choice(init = t0, prior = pn, lh = "gev", data = max_years$data,
psd = rep(0.05,3), tol = rep(0.02, 3)) ) $psd
max.mc3 <- posterior(2000, start.val, prior = pn, lh = "gev",
data = max_years$data, psd = psd)
mcmc.maxF <- mcmc (max.mc3, start = 500, end = 2000)
plot(mcmc.maxF, den = F, sm = F)The chains seem pretty well. Posterior distribution has probably reached stationarity but before going on with MC diagnostics, we will see other “methods”.
We add a bit more iterations and burn-in period as it is not to computationally expensive. We set the threshold that we found previously (30)
#igamma(shape = c(38.9,7.1,47), scale = c(1.5,6.3,2.6))
priorpp <- prior.quant(shape = c(0.05, 0.25, 0.1), scale = c(1.5,3,2)) # change
n <- 10000 ; b <- 2000 # change
rn.prior <- posterior(n, t0, priorpp, "none", psd = psd, burn = b)
#t0 <- c(43.2, 7.64, 0.32) ; s <- c(2, .2, .07)
rn.post <- posterior(n, maxpst$par, priorpp, data = max_all, "pp", thresh = 30,
noy = 116, psd = s, burn = b)
plot(mc.post <- mcmc(rn.post, start = 0, end = 8000))## Comparison with frequentist result previously obtained
apply(rn.post, 2, mean)## mu sigma xi
## 31.7826246 1.6215415 -0.2713979
gev_tx1$estimate## loc scale shape
## 30.5869782 2.0808010 -0.2543713
It seems good as well but again a (strong) autocorrelation in \(\mu\). Must handle this by playing with proposal standard deviations,… However, The result is similar (but still not the same) as the frequentist. We still do not assess convergence before going through with the nonstationary model.
pn.t <- prior.norm(mean = c(0,0,0), cov = var.prior, trendsd = 500)
rn.post.t <- posterior(10000, c(maxpst$par,1), pn.t, data = max_all, "pp", thresh = 32,
noy = 116, psd = c(s,.25), burn = b)
summary(rn.post.t)## mu sigma xi mutrend
## Min. :31.07 Min. :1.034 Min. :-0.46107 Min. :-19.9227
## 1st Qu.:31.60 1st Qu.:1.544 1st Qu.:-0.30838 1st Qu.: -7.4959
## Median :31.71 Median :1.673 Median :-0.26630 Median : 0.2643
## Mean :31.70 Mean :1.683 Mean :-0.26017 Mean : -1.3030
## 3rd Qu.:31.81 3rd Qu.:1.814 3rd Qu.:-0.21699 3rd Qu.: 4.3961
## Max. :32.10 Max. :2.412 Max. : 0.05617 Max. : 11.5324
plot(mc.post.t <- mcmc(rn.post.t, start = 0, end = 8000))Problem for the parameter associated with mutrend. Too much autocorrelations through the chain We then try to handle that introduce thinning : i.e., we only take one simulation every thin simulation
thin.post <- posterior(10000, c(maxpst$par,1), pn.t, data = max_all, "pp", thresh = 32,
noy = 116, psd = c(s, .25), burn = b, thin = 5)
summary(thin.post) ## mu sigma xi mutrend
## Min. :31.15 Min. :1.064 Min. :-0.46296 Min. :-22.284
## 1st Qu.:31.57 1st Qu.:1.532 1st Qu.:-0.31880 1st Qu.:-12.172
## Median :31.68 Median :1.688 Median :-0.26884 Median :-10.191
## Mean :31.66 Mean :1.706 Mean :-0.26216 Mean : -9.975
## 3rd Qu.:31.77 3rd Qu.:1.849 3rd Qu.:-0.21638 3rd Qu.: -7.451
## Max. :32.09 Max. :2.579 Max. : 0.07671 Max. : 1.534
plot(mc.post <- mcmc(thin.post, start = , end = nrow(thin.post)))However, this does not change anything… We must handle that. Inferences with trend are not reliable.
#rl.pred(max.mc2, npy = 365.25, qlim = c(30,45))
#rl.pred(max.mc3, qlim = c(30,45))To be continued
## simulate data from fitted GEV
sim <- rextRemes(gev_tx2, 10000)
# fit_sim <- fevd(sim)
# plot(fit_sim)To be continued…
Beirlant, Jan, Yuri Goegebeur, Johan Segers, and Jozef Teugels. 2006. Statistics of Extremes: Theory and Applications. John Wiley & Sons.
Dey, Dipak K., and Jun Yan. 2016. Extreme Value Modeling and Risk Analysis: Methods and Applications. CRC Press.
Hartmann, Marcelo, and Ricardo Ehlers. 2016. “Bayesian Inference for Generalized Extreme Value Distributions via Hamiltonian Monte Carlo.” Communications in Statistics - Simulation and Computation, March, 0–0. doi:10.1080/03610918.2016.1152365.
Scarrott, Carl, and Anna MacDonald. 2012. “A Review of Extreme Value Threshold Es-Timation and Uncertainty Quantification.” REVSTAT–Statistical Journal 10 (1): 33–60. https://www.ine.pt/revstat/pdf/rs120102.pdf.